Let's start importing some basic packages.

In [20]:
from sodapy import Socrata
import json
import pickle
import ast
import os
import sys
import itertools
from random import randrange

import pandas as pd
import numpy as np
import psycopg2

from datetime import datetime, timedelta
import time
import dateutil

The current path of the application and its parent directory are important to move around the different folders of the application.

In [8]:
currentdir = os.getcwd()
parentdir = os.path.abspath(os.path.join(currentdir, os.pardir))
Out[8]:
'C:\\Users\\rcmpo\\OneDrive\\Escritorio\\TFM\\application'

Let's add the utils folder to the Python runtime environment using 'sys' library. It allows importing functions within our application without writing them again here in the notebook.

In [9]:
sys.path.insert(0, parentdir + '\\' + 'utils')
In [14]:
from db_utils import (DBConnection, 
                      get_all_link_ids, 
                      get_all_boroughs, 
                      get_coordinates_from_link_id, 
                      get_borough_from_link_id, 
                      get_centroid_from_borough, 
                      get_most_recent_date_stored_for_each_link_id)

from db_utils import (SocrataAPI, 
                      query_flow_to_store_data_in_database, 
                      get_avg_and_quantiles_speed_from_historical_data, 
                      get_real_time_traffic_data)

Our Prophet model needs information about when New York Knicks plays at home in Madison Square Garden. Using 'requests' library and navigating around the NBA official website we can find this URL where all matches of the season were included. We create a list with all the start hour of NYK home matches.

In [15]:
import urllib.request, json 
with urllib.request.urlopen('https://es.global.nba.com/stats2/team/schedule.json?countryCode=ES&locale=es&teamCode=knicks') as url:
    data = json.loads(url.read().decode())
    
home_dates = []
    
for month in range(len(data['payload']['monthGroups'])):
    for match in range(len(data['payload']['monthGroups'][month]['games'])):
        if data['payload']['monthGroups'][month]['games'][match]['homeTeam']['profile']['abbr'] == 'NYK':
            home_dates.append(data['payload']['monthGroups'][month]['games'][match]['profile']['dateTimeEt'])

home_dates = [dateutil.parser.isoparse(x).replace(minute = 0).isoformat() if dateutil.parser.isoparse(x).minute == 30 else dateutil.parser.isoparse(x).isoformat() for x in home_dates] # Check that datetimes follow datetime format.
home_dates
Out[15]:
['2021-10-05T19:00:00',
 '2021-10-13T19:00:00',
 '2021-10-15T19:00:00',
 '2021-10-20T19:00:00',
 '2021-10-24T19:00:00',
 '2021-10-26T19:00:00',
 '2021-11-01T19:00:00',
 '2021-11-07T18:00:00',
 '2021-11-10T19:00:00',
 '2021-11-15T19:00:00',
 '2021-11-17T19:00:00',
 '2021-11-20T17:00:00',
 '2021-11-23T19:00:00',
 '2021-11-26T19:00:00',
 '2021-12-02T19:00:00',
 '2021-12-04T13:00:00',
 '2021-12-12T12:00:00',
 '2021-12-14T19:00:00',
 '2021-12-21T19:00:00',
 '2021-12-23T19:00:00',
 '2021-12-25T12:00:00',
 '2022-01-04T19:00:00',
 '2022-01-06T19:00:00',
 '2022-01-10T19:00:00',
 '2022-01-12T19:00:00',
 '2022-01-17T13:00:00',
 '2022-01-18T19:00:00',
 '2022-01-20T19:00:00',
 '2022-01-23T13:00:00',
 '2022-01-31T19:00:00',
 '2022-02-02T19:00:00',
 '2022-02-14T19:00:00',
 '2022-02-16T19:00:00',
 '2022-02-25T19:00:00',
 '2022-02-27T13:00:00',
 '2022-03-16T19:00:00',
 '2022-03-18T19:00:00',
 '2022-03-20T19:00:00',
 '2022-03-22T19:00:00',
 '2022-03-28T19:00:00',
 '2022-03-30T19:00:00',
 '2022-04-02T13:00:00',
 '2022-04-06T19:00:00',
 '2022-04-10T13:00:00']

And also let's create two functions to add to DataFrame with the purpose of creating new columns on it which works as regressor when the model is trained. Note that, in order to see the impact of a NBA match in the traffic state, it is not enough just getting the match start hour, but also a hour before to its beginning. We will include both hours in the DataFrame using below function.

In [21]:
nba_schedule_url = 'https://es.global.nba.com/stats2/team/schedule.json?countryCode=ES&locale=es&teamCode=knicks'

def game_in_madison_square_garden(date):
    with urllib.request.urlopen(nba_schedule_url) as url:
        data = json.loads(url.read().decode())
        
    home_dates = []
        
    for month in range(len(data['payload']['monthGroups'])):
        for match in range(len(data['payload']['monthGroups'][month]['games'])):
            if data['payload']['monthGroups'][month]['games'][match]['homeTeam']['profile']['abbr'] == 'NYK':
                home_dates.append(data['payload']['monthGroups'][month]['games'][match]['profile']['dateTimeEt'])

    home_dates = [dateutil.parser.isoparse(x).replace(minute = 0).isoformat() if dateutil.parser.isoparse(x).minute == 30 else dateutil.parser.isoparse(x).isoformat() for x in home_dates] # Check that datetimes follow datetime format.
    parsed_dates = [dateutil.parser.isoparse(x) for x in home_dates]
    previous_hour = [(x - timedelta(hours = 1)).isoformat() for x in parsed_dates]
    model_dates = home_dates + previous_hour
    if date in [model_dates]:
        return 1
    else:
        return 0
    
def is_weekend(ds):
    date = pd.to_datetime(ds)
    if date.weekday() > 4:
        return 1
    else:
        return 0

Let's also define two functions to calculate model error metrics: RMSE and MAE.

In [18]:
def rmse(actual: np.ndarray, predicted: np.ndarray):
    length = len(actual)
    return np.sqrt((sum((actual - predicted) ** 2)) / length)

def mae(actual: np.ndarray, predicted: np.ndarray):
    return np.mean(abs(actual - predicted))

We have defined all the required utils to train the model. Now, let's create the dataset for the first highway.

In [25]:
link_id = 1
query = f"SELECT * from tb_historical_traffic WHERE link_id = {link_id}"

db = DBConnection()
# Return the query to the local database as a DataFrame sorted by date.
df = db.perform_query(query).sort_values(by = 'measurement_date')

# Filter the columns that we need to the training and set the date as index to resample the DataFrame later.
df = df[['measurement_date', 'speed']]
df = df.set_index('measurement_date')
df.index = pd.to_datetime(df.index)

# Filter the outliers using the IQR approach.
q3, q1 = np.quantile(df['speed'], 0.75), np.quantile(df['speed'], 0.25)
iqr = q3 - q1
upper_limit, lower_limit = q3 + 1.5 * iqr, q1 - 1.5 * iqr
df = df[(df['speed'] > lower_limit) & (df['speed'] < upper_limit)]

# Resample the DataFrame by hour using the average of the observations.
df = df.resample('1H').mean()
df = df.reset_index(drop = False)

# Rename the columns to the names that Prophet supports.
df = df.rename(columns = {'measurement_date': 'ds', 'speed': 'y'})
df
Performing connection with database...
Connection successful !
Performing query to database...
Query finished !
Connection closed.
Out[25]:
ds y
0 2017-05-05 09:00:00 8.700000
1 2017-05-05 10:00:00 8.390000
2 2017-05-05 11:00:00 6.318333
3 2017-05-05 12:00:00 NaN
4 2017-05-05 13:00:00 3.110000
... ... ...
41961 2022-02-16 18:00:00 1.505714
41962 2022-02-16 19:00:00 3.257500
41963 2022-02-16 20:00:00 2.015000
41964 2022-02-16 21:00:00 3.257500
41965 2022-02-16 22:00:00 3.100000

41966 rows × 2 columns

This is how the DataFrame looks, however, we need to apply also the two functions created before to use the new columns as regressor in the training. Let's do it.

In [26]:
df['weekend'] = df['ds'].apply(is_weekend)
df['knicks_game'] = df['ds'].apply(match_in_madison_square_garden)
df
Out[26]:
ds y weekend knicks_game
0 2017-05-05 09:00:00 8.700000 0 0
1 2017-05-05 10:00:00 8.390000 0 0
2 2017-05-05 11:00:00 6.318333 0 0
3 2017-05-05 12:00:00 NaN 0 0
4 2017-05-05 13:00:00 3.110000 0 0
... ... ... ... ...
41961 2022-02-16 18:00:00 1.505714 0 0
41962 2022-02-16 19:00:00 3.257500 0 0
41963 2022-02-16 20:00:00 2.015000 0 0
41964 2022-02-16 21:00:00 3.257500 0 0
41965 2022-02-16 22:00:00 3.100000 0 0

41966 rows × 4 columns

Apart from weekends and matches in the Madison Square Garden, Prophet supports information about holidays. Let's create another DataFrame using the 'holidays' Python package.

In [27]:
import holidays
nyc_holidays = pd.DataFrame(holidays.CountryHoliday('US', state = 'NY', years = 2021).items()).rename(columns = {0: 'ds', 1: 'holiday'})
nyc_holidays
Out[27]:
ds holiday
0 2021-01-01 New Year's Day
1 2021-12-31 New Year's Day (Observed)
2 2021-01-18 Martin Luther King Jr. Day
3 2021-02-12 Lincoln's Birthday
4 2021-02-15 Washington's Birthday, Susan B. Anthony Day
5 2021-05-31 Memorial Day
6 2021-06-19 Juneteenth National Independence Day
7 2021-06-18 Confederate Memorial Day (Observed)
8 2021-07-04 Independence Day
9 2021-07-05 Independence Day (Observed)
10 2021-09-06 Labor Day
11 2021-10-11 Columbus Day
12 2021-11-02 Election Day
13 2021-11-11 Veterans Day
14 2021-11-25 Thanksgiving
15 2021-12-25 Christmas Day
16 2021-12-24 Christmas Day (Observed)

Cross Validation and Grid Search is performed here below. We have selected 3 different parameters to tune, having 18 different combinations in total. We predict in 6 different periods for each hyper parameter combination using a rolling window cross validation and after that, we take the average of the 6 periods RMSE. The lowest RMSE is the best one and its parameters combination associated to it is selected to test again the model on test data.

In [73]:
from fbprophet import Prophet
from fbprophet.serialize import model_to_json, model_from_json

param_grid = {'changepoint_prior_scale': [0.01, 0.1, 0.5], 
              'changepoint_range': [0.8, 0.9, 0.95], 
              'seasonality_mode': ['multiplicative', 'additive']}

# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

grid_search_df = pd.DataFrame()
avg_RMSEs = []

SAVE_MODEL = True

for params in all_params:
    print(f'############## {params} ##############')
    
    start_train_date = (datetime.now() - timedelta(days = 365)).replace(hour = 0, minute = 0, second = 0, microsecond = 0)
    end_train_date = start_train_date + timedelta(days = 30 * 8)
    
    stop_validation = end_train_date + timedelta(days = 30 * 3)
    
    RMSEs = {}
    period_n = 0
    
    while True:

        start_val_date = end_train_date
        end_val_date = start_val_date + timedelta(days = 14)

        if end_val_date > stop_validation:
            print(f'\nDate limit ({stop_validation}) to validate data reached!')
            end_val_date = start_val_date
            break
            
        print(f'Training data between {start_train_date}, and {end_train_date}')
            
        train_data = df[(df['ds'] > start_train_date.strftime('%Y-%m-%d')) & (df['ds'] < end_train_date.strftime('%Y-%m-%d'))]
        val_data = df[(df['ds'] > start_val_date.strftime('%Y-%m-%d')) & (df['ds'] < end_val_date.strftime('%Y-%m-%d'))]

        m = Prophet(**params, 
                    yearly_seasonality = True,
                    weekly_seasonality = True, 
                    daily_seasonality = True, 
                    holidays = nyc_holidays)

        m.add_regressor('weekend')
        m.add_regressor('knicks_game')

        m.fit(train_data)
        future = m.make_future_dataframe(periods = 360, freq = 'H')
        future['weekend'] = future['ds'].apply(is_weekend)
        future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)

        forecast = m.predict(future)
        forecast = forecast[(forecast['ds'] > start_val_date.isoformat()) & (forecast['ds'] < end_val_date.isoformat())]

        metrics_df = val_data.merge(right = forecast, how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()
        metrics_df = metrics_df.set_index('ds')

        metrics_df = metrics_df.dropna()

        actual, predicted = np.array(metrics_df['y'].values), np.array(metrics_df['yhat'].values)
        rmse_ = rmse(actual, predicted)
        
        RMSEs[period_n] = rmse_
        
        print(f'Validation metrics between {start_val_date} and {end_val_date} -> RMSE: {rmse_}\n')

        start_train_date = start_train_date + timedelta(days = 14)
        end_train_date = end_train_date + timedelta(days = 14)
        
        period_n += 1
    
    grid_search_df = grid_search_df.append(RMSEs, ignore_index = True)
    avg_rmse, std_rmse = np.round(np.mean([x for x in RMSEs.values()]), 3), np.round(np.std([x for x in RMSEs.values()]), 3)
    avg_RMSEs.append(avg_rmse)
    print('')
    print(f'->  Average RMSE using 14-days Cross Validation: {avg_rmse} ± {std_rmse}')
    print('')
    
print('All Cross Validation finished!')

grid_search_df['Avg. RMSE'] = avg_RMSEs
final_grid_search_df = pd.concat([pd.DataFrame(all_params), grid_search_df], axis = 1)

best_df = final_grid_search_df[final_grid_search_df.index == final_grid_search_df['Avg. RMSE'].idxmin()][[x for x in final_grid_search_df.columns if x not in [0, 1, 2, 3, 4, 5, 'Avg. RMSE']]]
best_params = best_df.to_dict(orient = 'records')[0]

print(f'\n --- Best params: {best_params} ---\n')

 # Test data after the latest validation performed !
print(f'Testing data after {end_val_date} with optimal model...')
test_data = df[df['ds'] > end_val_date]

if SAVE_MODEL:
    m = Prophet(**best_params, 
                yearly_seasonality = True,
                weekly_seasonality = True, 
                daily_seasonality = True, 
                holidays = nyc_holidays)

    m.add_regressor('weekend')
    m.add_regressor('knicks_game')
    m.fit(test_data)
    print('Done!')

    with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'w') as fout:
        json.dump(model_to_json(m), fout)
else:
    with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'r') as fin:
        m = model_from_json(json.load(fin))

future = m.make_future_dataframe(periods = 360, freq = 'H')
future['weekend'] = future['ds'].apply(is_weekend)
future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)

forecast = m.predict(future)
predictions = test_data.merge(right = forecast[['ds', 'yhat']], how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()

print(f"RMSE for test data: {rmse(predictions['y'].values, predictions['yhat'].values)}")

predictions['mae'] = abs(predictions['y'] - predictions['yhat'])
print(f"MAE for test data: {np.mean(predictions['mae'])}")
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.548646658205556

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.613006458084591

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 3.632000382607575

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 8.0269451308222

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.388647247407453

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.406160683206686


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 6.603 ± 2.421

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 4.652056423383575

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.473010079561313

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.1575973841360785

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 12.604162859000365

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.642418533833725

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.301993736749074


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.805 ± 2.992

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.5461079281709367

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.28283493551903

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 3.997358557614436

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.474140878055749

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.089199286577944

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 6.897272828262811


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 6.214 ± 2.144

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.8670076830877917

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.63497170509369

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.455861516143279

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 12.607277360765417

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.868039423336139

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.399785204990593


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.472 ± 2.96

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 3.55927597579024

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 4.320758976566218

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 3.6552906226437396

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 8.65193814350648

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.59272884282056

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 6.825586256132988


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 6.101 ± 2.41

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 5.176063481075989

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.246194067116321

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.60541189659985

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 12.746027506393263

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.849700439219339

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.064610372436119


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.781 ± 2.959

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.438099449051967

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 15.225417004775556

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.461090911306057

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 25.670853467143967

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 10.511810667588907

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 15.661904246516121


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 13.162 ± 6.813

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 23.088028702679495

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 54.26831727551163

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.649864281644529

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 80.48008489074928

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 39.18562086718136

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 25.554544778810417


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 38.538 ± 23.479

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 16.166773733282824

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 12.262770322869365

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 4.208400666189378

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 22.095670830857987

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 5.909290497636558

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 16.419447805969714


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 12.844 ± 6.224

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 23.697288653402477

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 37.18942994202489

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.945632125009102

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 80.36649015106038

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 55.52353299375275

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 33.161773834311475


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 39.981 ± 22.709

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 9.601623318359167

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 14.360614350237798

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.317535200236071

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 17.67919410715513

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 6.44147173839853

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 15.296656862496901


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 11.45 ± 4.622

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 45.622545586315944

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 53.91577969924726

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.255825168069293

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 47.93112279623359

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 18.919766147483966

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 9.602383949918279


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 30.875 ± 18.718

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 8.605729360248946

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.247944327539396

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 6.3039927163062055

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 26.199906638649587

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.899615446808091

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 15.955709437681783


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 12.035 ± 7.079

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 32.08959856678633

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 4.5904067082829485

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 18.704632000261213

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 55.88247304266387

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.645499836538812

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 13.142762449128305


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 22.343 ± 17.292

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 30.597074777247894

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 5.7715415827584495

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 6.196917717720455

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 20.99420444917006

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 5.337882104958939

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 16.756685997785464


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 14.276 ± 9.444

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 8.504746353897035

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 4.690421641254657

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 40.23232143331802

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 64.84724341570151

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 24.466934146201954

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 25.374508972432185


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 28.019 ± 20.208

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 27.286764449027377

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 25.73645834990501

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 5.8374698003813545

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 15.03448071920315

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 4.507477424801021

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 20.003246457253233


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 16.401 ± 8.884

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 44.751237948803414

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 36.87242700158772

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.107100784828411

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 66.8029758882345

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 17.419691444545155

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 23.26918471165233


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 33.037 ± 19.166

All Cross Validation finished!

 --- Best params: {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ---

Testing data after 2022-01-16 00:00:00 with optimal model...
Done!
RMSE for test data: 3.14451660725604
MAE for test data: 2.1124367023432167

The final results of the model training are collected in the following DataFrame (0, 1, 2, 3, 4 and 5 are the different periods using for cross validation)

In [74]:
final_grid_search_df
Out[74]:
changepoint_prior_scale changepoint_range seasonality_mode 0 1 2 3 4 5 Avg. RMSE
0 0.01 0.80 multiplicative 3.548647 6.613006 3.632000 8.026945 10.388647 7.406161 6.603
1 0.01 0.80 additive 4.652056 5.473010 5.157597 12.604163 10.642419 8.301994 7.805
2 0.01 0.90 multiplicative 3.546108 6.282835 3.997359 6.474141 10.089199 6.897273 6.214
3 0.01 0.90 additive 3.867008 5.634972 5.455862 12.607277 9.868039 7.399785 7.472
4 0.01 0.95 multiplicative 3.559276 4.320759 3.655291 8.651938 9.592729 6.825586 6.101
5 0.01 0.95 additive 5.176063 5.246194 5.605412 12.746028 10.849700 7.064610 7.781
6 0.10 0.80 multiplicative 6.438099 15.225417 5.461091 25.670853 10.511811 15.661904 13.162
7 0.10 0.80 additive 23.088029 54.268317 8.649864 80.480085 39.185621 25.554545 38.538
8 0.10 0.90 multiplicative 16.166774 12.262770 4.208401 22.095671 5.909290 16.419448 12.844
9 0.10 0.90 additive 23.697289 37.189430 9.945632 80.366490 55.523533 33.161774 39.981
10 0.10 0.95 multiplicative 9.601623 14.360614 5.317535 17.679194 6.441472 15.296657 11.450
11 0.10 0.95 additive 45.622546 53.915780 9.255825 47.931123 18.919766 9.602384 30.875
12 0.50 0.80 multiplicative 8.605729 7.247944 6.303993 26.199907 7.899615 15.955709 12.035
13 0.50 0.80 additive 32.089599 4.590407 18.704632 55.882473 9.645500 13.142762 22.343
14 0.50 0.90 multiplicative 30.597075 5.771542 6.196918 20.994204 5.337882 16.756686 14.276
15 0.50 0.90 additive 8.504746 4.690422 40.232321 64.847243 24.466934 25.374509 28.019
16 0.50 0.95 multiplicative 27.286764 25.736458 5.837470 15.034481 4.507477 20.003246 16.401
17 0.50 0.95 additive 44.751238 36.872427 9.107101 66.802976 17.419691 23.269185 33.037

Best parameters are:

  • changepoint_prior_scale = 0.01
  • changepoint_range = 0.95
  • seasonality_mode = multiplicative

And with these parameters we get a metrics on test data of:

  • RMSE = 3.144
  • MAE = 2.112

Let's export the DataFrame in order to use it in the dissertation.

In [76]:
import dataframe_image as dfi
dfi.export(final_grid_search_df, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\final_grid_search_df.png', max_rows = 20)

Finally, let's compare the results with a seasonality Naïve and a 3-weeks seasonality Moving average model:

In [78]:
naive = pd.concat([val_data, test_data]).reset_index(drop = True)
naive = naive[naive['ds'] > '2022-01-14 00:00:00']
naive = naive.set_index('ds')
naive = naive.sort_index()
naive['yhat'] = naive['y'].shift(periods = 24, freq = 'H')

naive = naive.dropna()

naive['mae'] = abs(naive['y'] - naive['yhat'])

print(f"MAE for test data: {np.mean(naive['mae'])}")
rmse_ = rmse(naive['y'].values, naive['yhat'].values)
print(f"RMSE for test data: {rmse_}")
MAE for test data: 2.3880420414116545
RMSE for test data: 3.759215566389097
In [79]:
def get_previous_dates_given_a_date(date):
    dates = [date - timedelta(days = 7), date - timedelta(days = 14), date - timedelta(days = 21)]
    return dates

pd.options.display.max_rows = 20
moving_avg = pd.concat([train_data, val_data, test_data]).reset_index(drop = True)
moving_avg = moving_avg.drop(columns = ['weekend', 'knicks_game'])
moving_avg['yhat'] = None
moving_avg = moving_avg.set_index('ds')
moving_avg = moving_avg.sort_index()
moving_avg = moving_avg[moving_avg.index > '2021-12-01']

for idx in moving_avg.index:
    previous_dates = get_previous_dates_given_a_date(idx)
    speeds = moving_avg[moving_avg.index.isin(previous_dates)]['y'].values
    avg = np.mean([x for x in speeds if str(x) != 'nan'])
    moving_avg.at[idx, 'yhat'] = avg
moving_avg = moving_avg[moving_avg.index > '2022-01-15 00:00:00']
moving_avg = moving_avg.dropna()

moving_avg['mae'] = abs(moving_avg['y'] - moving_avg['yhat'])
print(f"MAE for test data: {np.mean(moving_avg['mae'])}")
rmse_ = rmse(moving_avg['y'].values, moving_avg['yhat'].values)
print(f"RMSE for test data: {rmse_}")
C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\fromnumeric.py:3440: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\_methods.py:189: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
MAE for test data: 2.2335440682768026
RMSE for test data: 3.578375165987913
In [80]:
final_comparison = pd.DataFrame(index = ['MAE', 'RMSE'])

final_comparison['Prophet'] = [2.112, 3.144]
final_comparison['Naïve'] = [2.388, 3.759]
final_comparison['Moving average'] = [2.233, 3.578]

dfi.export(final_comparison, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\test_comparison.png')
In [81]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.y, name = 'Real'))
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.yhat, name = 'Prophet'))
fig.add_trace(go.Scatter(x = naive.index, y = naive.yhat, name = 'Naïve', marker = dict(color = 'orange')))
fig.add_trace(go.Scatter(x = moving_avg.index, y = moving_avg.yhat, name = 'Moving average', marker = dict(color = 'crimson')))
fig.show()

Now let's repeat the whole process with the second highway.

In [82]:
link_id = 365
query = f"SELECT * from tb_historical_traffic WHERE link_id = {link_id}"

db = DBConnection()
# Return the query to the local database as a DataFrame sorted by date.
df = db.perform_query(query).sort_values(by = 'measurement_date')

# Filter the columns that we need to the training and set the date as index to resample the DataFrame later.
df = df[['measurement_date', 'speed']]
df = df.set_index('measurement_date')
df.index = pd.to_datetime(df.index)

# Filter the outliers using the IQR approach.
q3, q1 = np.quantile(df['speed'], 0.75), np.quantile(df['speed'], 0.25)
iqr = q3 - q1
upper_limit, lower_limit = q3 + 1.5 * iqr, q1 - 1.5 * iqr
df = df[(df['speed'] > lower_limit) & (df['speed'] < upper_limit)]

# Resample the DataFrame by hour using the average of the observations.
df = df.resample('1H').mean()
df = df.reset_index(drop = False)

# Rename the columns to the names that Prophet supports.
df = df.rename(columns = {'measurement_date': 'ds', 'speed': 'y'})
df
Performing connection with database...
Connection successful !
Performing query to database...
Query finished !
Connection closed.
Out[82]:
ds y
0 2016-12-28 09:00:00 14.910000
1 2016-12-28 10:00:00 NaN
2 2016-12-28 11:00:00 NaN
3 2016-12-28 12:00:00 NaN
4 2016-12-28 13:00:00 NaN
... ... ...
45033 2022-02-16 18:00:00 11.335833
45034 2022-02-16 19:00:00 20.344167
45035 2022-02-16 20:00:00 25.264167
45036 2022-02-16 21:00:00 22.107500
45037 2022-02-16 22:00:00 26.405000

45038 rows × 2 columns

In [83]:
df['weekend'] = df['ds'].apply(is_weekend)
df['knicks_game'] = df['ds'].apply(match_in_madison_square_garden)
df
Out[83]:
ds y weekend knicks_game
0 2016-12-28 09:00:00 14.910000 0 0
1 2016-12-28 10:00:00 NaN 0 0
2 2016-12-28 11:00:00 NaN 0 0
3 2016-12-28 12:00:00 NaN 0 0
4 2016-12-28 13:00:00 NaN 0 0
... ... ... ... ...
45033 2022-02-16 18:00:00 11.335833 0 0
45034 2022-02-16 19:00:00 20.344167 0 0
45035 2022-02-16 20:00:00 25.264167 0 0
45036 2022-02-16 21:00:00 22.107500 0 0
45037 2022-02-16 22:00:00 26.405000 0 0

45038 rows × 4 columns

In [85]:
from fbprophet import Prophet
from fbprophet.serialize import model_to_json, model_from_json

param_grid = {'changepoint_prior_scale': [0.01, 0.1, 0.5], 
              'changepoint_range': [0.8, 0.9, 0.95], 
              'seasonality_mode': ['multiplicative', 'additive']}

# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

grid_search_df = pd.DataFrame()
avg_RMSEs = []

SAVE_MODEL = True

for params in all_params:
    print(f'############## {params} ##############')
    
    start_train_date = (datetime.now() - timedelta(days = 365)).replace(hour = 0, minute = 0, second = 0, microsecond = 0)
    end_train_date = start_train_date + timedelta(days = 30 * 8)
    
    stop_validation = end_train_date + timedelta(days = 30 * 3)
    
    RMSEs = {}
    period_n = 0
    
    while True:

        start_val_date = end_train_date
        end_val_date = start_val_date + timedelta(days = 14)

        if end_val_date > stop_validation:
            print(f'\nDate limit ({stop_validation}) to validate data reached!')
            end_val_date = start_val_date
            break
            
        print(f'Training data between {start_train_date}, and {end_train_date}')
            
        train_data = df[(df['ds'] > start_train_date.strftime('%Y-%m-%d')) & (df['ds'] < end_train_date.strftime('%Y-%m-%d'))]
        val_data = df[(df['ds'] > start_val_date.strftime('%Y-%m-%d')) & (df['ds'] < end_val_date.strftime('%Y-%m-%d'))]

        m = Prophet(**params, 
                    yearly_seasonality = True,
                    weekly_seasonality = True, 
                    daily_seasonality = True, 
                    holidays = nyc_holidays)

        m.add_regressor('weekend')
        m.add_regressor('knicks_game')

        m.fit(train_data)
        future = m.make_future_dataframe(periods = 360, freq = 'H')
        future['weekend'] = future['ds'].apply(is_weekend)
        future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)

        forecast = m.predict(future)
        forecast = forecast[(forecast['ds'] > start_val_date.isoformat()) & (forecast['ds'] < end_val_date.isoformat())]

        metrics_df = val_data.merge(right = forecast, how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()
        metrics_df = metrics_df.set_index('ds')

        metrics_df = metrics_df.dropna()

        actual, predicted = np.array(metrics_df['y'].values), np.array(metrics_df['yhat'].values)
        rmse_ = rmse(actual, predicted)
        
        RMSEs[period_n] = rmse_
        
        print(f'Validation metrics between {start_val_date} and {end_val_date} -> RMSE: {rmse_}\n')

        start_train_date = start_train_date + timedelta(days = 14)
        end_train_date = end_train_date + timedelta(days = 14)
        
        period_n += 1
    
    grid_search_df = grid_search_df.append(RMSEs, ignore_index = True)
    avg_rmse, std_rmse = np.round(np.mean([x for x in RMSEs.values()]), 3), np.round(np.std([x for x in RMSEs.values()]), 3)
    avg_RMSEs.append(avg_rmse)
    print('')
    print(f'->  Average RMSE using 14-days Cross Validation: {avg_rmse} ± {std_rmse}')
    print('')
    
print('All Cross Validation finished!')

grid_search_df['Avg. RMSE'] = avg_RMSEs
final_grid_search_df = pd.concat([pd.DataFrame(all_params), grid_search_df], axis = 1)

best_df = final_grid_search_df[final_grid_search_df.index == final_grid_search_df['Avg. RMSE'].idxmin()][[x for x in final_grid_search_df.columns if x not in [0, 1, 2, 3, 4, 5, 'Avg. RMSE']]]
best_params = best_df.to_dict(orient = 'records')[0]

print(f'\n --- Best params: {best_params} ---\n')

 # Test data after the latest validation performed !
print(f'Testing data after {end_val_date} with optimal model...')
test_data = df[df['ds'] > end_val_date]

if SAVE_MODEL:
    m = Prophet(**best_params, 
                yearly_seasonality = True,
                weekly_seasonality = True, 
                daily_seasonality = True, 
                holidays = nyc_holidays)

    m.add_regressor('weekend')
    m.add_regressor('knicks_game')
    m.fit(test_data)
    print('Done!')

    with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'w') as fout:
        json.dump(model_to_json(m), fout)
else:
    with open(f'C://Users//rcmpo//OneDrive//Escritorio//TFM//application//models//{link_id}.json', 'r') as fin:
        m = model_from_json(json.load(fin))

future = m.make_future_dataframe(periods = 360, freq = 'H')
future['weekend'] = future['ds'].apply(is_weekend)
future['knicks_game'] = future['ds'].apply(match_in_madison_square_garden)

forecast = m.predict(future)
predictions = test_data.merge(right = forecast[['ds', 'yhat']], how = 'inner', left_on = 'ds', right_on = 'ds')[['ds', 'y', 'yhat']].dropna()

print(f"RMSE for test data: {rmse(predictions['y'].values, predictions['yhat'].values)}")

predictions['mae'] = abs(predictions['y'] - predictions['yhat'])
print(f"MAE for test data: {np.mean(predictions['mae'])}")
############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.661891882991734

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.9352011806155796

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.140181762028284

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.687998223523513

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.446084037043216

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.076470178501161


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.491 ± 1.213

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.891270214966242

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.459753379856594

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.930001107405019

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.429871017177495

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.936014849005898

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.449459191279394


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.349 ± 0.885

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.637427200394125

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.950098103892979

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.282869936233519

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.581853058915747

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.504252694932129

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.992698501993131


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.492 ± 1.265

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.8845543094736845

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.475015622215044

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.792170371372002

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.271316471562356

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.880442851262315

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.430348066788547


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.289 ± 0.865

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.621319247315348

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.779394265598186

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.002164000190573

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.604490587257373

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 9.60810127579831

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.962599177699075


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.43 ± 1.276

############## {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.865071352204061

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.445237958139613

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.66352111488622

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.299018617372367

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 7.959561791137905

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 7.618813063420301


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 7.309 ± 0.848

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.989328841041767

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.190773659776363

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.714018630427699

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.783276451614362

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.93528301143299

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.628398329724636


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 10.04 ± 5.854

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.807099194154355

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.025946895722976

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.51758474698422

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.6143543282507515

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 20.200223115251287

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.926058216556061


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 9.849 ± 4.757

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.964467961913226

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.214718572366223

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.67644195426523

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.689073892909414

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.90464600021188

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.157711578812698


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 10.435 ± 5.832

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.807138482595291

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 7.049733589565095

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.593984454244891

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.647980459650669

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 17.277499095806817

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.889886178457935


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 9.378 ± 3.701

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.943724491538316

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 9.146758103306716

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 8.673010714004679

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 5.75360633367981

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 23.78231270475493

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 13.48918456766125


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 11.298 ± 6.081

############## {'changepoint_prior_scale': 0.1, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.808144009721109

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 6.995379546738166

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.536900364427702

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 6.610146363907281

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 20.84598350594019

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 8.656543244447256


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 9.909 ± 5.005

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 7.663500325933629

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 21.443038300285114

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 12.821141731667423

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 21.287214232697533

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.16215545585089

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 13.576774706801904


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 16.492 ± 5.471

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.8, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.895868414646396

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 22.208226184036253

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.562404700079158

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 46.245246420636946

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.339782599449375

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.11897506673142


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 19.728 ± 13.281

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 10.317548914600524

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 20.77383418041363

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 11.719932983955722

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 20.608933445969893

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 21.16414388188165

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 14.515269914681356


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 16.517 ± 4.508

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.87653301645642

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 27.08741334705353

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.51255761562299

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 28.471372300048365

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 23.85185625661355

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.287479842878222


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 17.848 ± 8.824

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'multiplicative'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 7.1987690439216845

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 23.396497293317683

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 12.030027062259514

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 16.5733663737021

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.41944673865343

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 14.628800480129934


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 16.041 ± 5.647

############## {'changepoint_prior_scale': 0.5, 'changepoint_range': 0.95, 'seasonality_mode': 'additive'} ##############
Training data between 2021-02-26 00:00:00, and 2021-10-24 00:00:00
Validation metrics between 2021-10-24 00:00:00 and 2021-11-07 00:00:00 -> RMSE: 6.876698617622195

Training data between 2021-03-12 00:00:00, and 2021-11-07 00:00:00
Validation metrics between 2021-11-07 00:00:00 and 2021-11-21 00:00:00 -> RMSE: 30.21281677372825

Training data between 2021-03-26 00:00:00, and 2021-11-21 00:00:00
Validation metrics between 2021-11-21 00:00:00 and 2021-12-05 00:00:00 -> RMSE: 9.576718323577156

Training data between 2021-04-09 00:00:00, and 2021-12-05 00:00:00
Validation metrics between 2021-12-05 00:00:00 and 2021-12-19 00:00:00 -> RMSE: 37.76287014793415

Training data between 2021-04-23 00:00:00, and 2021-12-19 00:00:00
Validation metrics between 2021-12-19 00:00:00 and 2022-01-02 00:00:00 -> RMSE: 22.135387354270698

Training data between 2021-05-07 00:00:00, and 2022-01-02 00:00:00
Validation metrics between 2022-01-02 00:00:00 and 2022-01-16 00:00:00 -> RMSE: 11.072309959353191


Date limit (2022-01-22 00:00:00) to validate data reached!

->  Average RMSE using 14-days Cross Validation: 19.606 ± 11.431

All Cross Validation finished!

 --- Best params: {'changepoint_prior_scale': 0.01, 'changepoint_range': 0.9, 'seasonality_mode': 'additive'} ---

Testing data after 2022-01-16 00:00:00 with optimal model...
Done!
RMSE for test data: 6.504893607080428
MAE for test data: 4.930203972191617

And for this second link road these are the results:

In [86]:
final_grid_search_df
Out[86]:
changepoint_prior_scale changepoint_range seasonality_mode 0 1 2 3 4 5 Avg. RMSE
0 0.01 0.80 multiplicative 6.661892 6.935201 8.140182 5.687998 9.446084 8.076470 7.491
1 0.01 0.80 additive 6.891270 6.459753 8.930001 6.429871 7.936015 7.449459 7.349
2 0.01 0.90 multiplicative 6.637427 6.950098 8.282870 5.581853 9.504253 7.992699 7.492
3 0.01 0.90 additive 6.884554 6.475016 8.792170 6.271316 7.880443 7.430348 7.289
4 0.01 0.95 multiplicative 6.621319 6.779394 8.002164 5.604491 9.608101 7.962599 7.430
5 0.01 0.95 additive 6.865071 6.445238 8.663521 6.299019 7.959562 7.618813 7.309
6 0.10 0.80 multiplicative 6.989329 7.190774 8.714019 5.783276 22.935283 8.628398 10.040
7 0.10 0.80 additive 6.807099 7.025947 9.517585 6.614354 20.200223 8.926058 9.849
8 0.10 0.90 multiplicative 6.964468 7.214719 8.676442 5.689074 22.904646 11.157712 10.435
9 0.10 0.90 additive 6.807138 7.049734 9.593984 6.647980 17.277499 8.889886 9.378
10 0.10 0.95 multiplicative 6.943724 9.146758 8.673011 5.753606 23.782313 13.489185 11.298
11 0.10 0.95 additive 6.808144 6.995380 9.536900 6.610146 20.845984 8.656543 9.909
12 0.50 0.80 multiplicative 7.663500 21.443038 12.821142 21.287214 22.162155 13.576775 16.492
13 0.50 0.80 additive 6.895868 22.208226 9.562405 46.245246 22.339783 11.118975 19.728
14 0.50 0.90 multiplicative 10.317549 20.773834 11.719933 20.608933 21.164144 14.515270 16.517
15 0.50 0.90 additive 6.876533 27.087413 9.512558 28.471372 23.851856 11.287480 17.848
16 0.50 0.95 multiplicative 7.198769 23.396497 12.030027 16.573366 22.419447 14.628800 16.041
17 0.50 0.95 additive 6.876699 30.212817 9.576718 37.762870 22.135387 11.072310 19.606

Note that in this case results are worse than before.

Best parameters are:

  • changepoint_prior_scale = 0.01
  • changepoint_range = 0.90
  • seasonality_mode = additive

And with these parameters we get a metrics on test data of:

  • RMSE = 6.505
  • MAE = 4.930

Let's export again this second DataFrame:

In [87]:
import dataframe_image as dfi
dfi.export(final_grid_search_df, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\final_grid_search_df_2.png', max_rows = 20)

Let's compare again the results with the other 2 models:

In [88]:
naive = pd.concat([val_data, test_data]).reset_index(drop = True)
naive = naive[naive['ds'] > '2022-01-14 00:00:00']
naive = naive.set_index('ds')
naive = naive.sort_index()
naive['yhat'] = naive['y'].shift(periods = 24, freq = 'H')

naive = naive.dropna()

naive['mae'] = abs(naive['y'] - naive['yhat'])

print(f"MAE for test data: {np.mean(naive['mae'])}")
rmse_ = rmse(naive['y'].values, naive['yhat'].values)
print(f"RMSE for test data: {rmse_}")
MAE for test data: 6.1816020670806155
RMSE for test data: 8.27205117029201
In [89]:
moving_avg = pd.concat([train_data, val_data, test_data]).reset_index(drop = True)
moving_avg = moving_avg.drop(columns = ['weekend', 'knicks_game'])
moving_avg['yhat'] = None
moving_avg = moving_avg.set_index('ds')
moving_avg = moving_avg.sort_index()
moving_avg = moving_avg[moving_avg.index > '2021-12-01']

for idx in moving_avg.index:
    previous_dates = get_previous_dates_given_a_date(idx)
    speeds = moving_avg[moving_avg.index.isin(previous_dates)]['y'].values
    avg = np.mean([x for x in speeds if str(x) != 'nan'])
    moving_avg.at[idx, 'yhat'] = avg
moving_avg = moving_avg[moving_avg.index > '2022-01-15 00:00:00']
moving_avg = moving_avg.dropna()

moving_avg['mae'] = abs(moving_avg['y'] - moving_avg['yhat'])
print(f"MAE for test data: {np.mean(moving_avg['mae'])}")
rmse_ = rmse(moving_avg['y'].values, moving_avg['yhat'].values)
print(f"RMSE for test data: {rmse_}")
C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\fromnumeric.py:3440: RuntimeWarning:

Mean of empty slice.

C:\Users\rcmpo\Anaconda3\envs\tfm_\lib\site-packages\numpy\core\_methods.py:189: RuntimeWarning:

invalid value encountered in double_scalars

MAE for test data: 5.743760804026095
RMSE for test data: 7.870923289096443

And let's see the final comparison. Prophet is again the best alternative.

In [90]:
final_comparison = pd.DataFrame(index = ['MAE', 'RMSE'])

final_comparison['Prophet'] = [4.930, 6.505]
final_comparison['Naïve'] = [6.182, 8.272]
final_comparison['Moving average'] = [5.744, 7.871]

dfi.export(final_comparison, r'C:\Users\rcmpo\OneDrive\Escritorio\TFM\memory_utils\test_comparison_2.png')
final_comparison
Out[90]:
Prophet Naïve Moving average
MAE 4.930 6.182 5.744
RMSE 6.505 8.272 7.871
In [91]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.y, name = 'Real'))
fig.add_trace(go.Scatter(x = predictions.ds, y = predictions.yhat, name = 'Prophet'))
fig.add_trace(go.Scatter(x = naive.index, y = naive.yhat, name = 'Naïve', marker = dict(color = 'orange')))
fig.add_trace(go.Scatter(x = moving_avg.index, y = moving_avg.yhat, name = 'Moving average', marker = dict(color = 'crimson')))
fig.show()
In [ ]: